{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/wucwu1/anaconda3/lib/python3.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n",
      "/home/wucwu1/anaconda3/lib/python3.7/site-packages/sklearn/grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n",
      "  DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "#导入必要的工具包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import xgboost as xgb\n",
    "from xgboost.sklearn import XGBClassifier\n",
    "from sklearn import cross_validation, metrics   #Additional scklearn functions\n",
    "from sklearn.grid_search import GridSearchCV   #Perforing grid search\n",
    "\n",
    "import matplotlib.pylab as plt\n",
    "%matplotlib inline\n",
    "from matplotlib.pylab import rcParams\n",
    "rcParams['figure.figsize'] = 12, 4\n",
    "\n",
    "train = pd.read_csv('train_modified.csv')\n",
    "target = 'Disbursed'\n",
    "IDcol = 'ID'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#利用XGBoost完成任务\n",
    "def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):\n",
    "    \n",
    "    if useTrainCV:\n",
    "        xgb_param = alg.get_xgb_params()\n",
    "        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)\n",
    "        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,\n",
    "            metrics='auc', early_stopping_rounds=early_stopping_rounds)\n",
    "        alg.set_params(n_estimators=cvresult.shape[0])\n",
    "    \n",
    "    #Fit the algorithm on the data\n",
    "    alg.fit(dtrain[predictors], dtrain['Disbursed'],eval_metric='auc')\n",
    "        \n",
    "    #Predict training set:\n",
    "    dtrain_predictions = alg.predict(dtrain[predictors])\n",
    "    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]\n",
    "        \n",
    "    #Print model report:\n",
    "    print (\"\\nModel Report\")\n",
    "    print (\"Accuracy : %.4g\" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))\n",
    "    print (\"AUC Score (Train): %f\" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))\n",
    "                    \n",
    "    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)\n",
    "    feat_imp.plot(kind='bar', title='Feature Importances')\n",
    "    plt.ylabel('Feature Importance Score')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Model Report\n",
      "Accuracy : 0.9854\n",
      "AUC Score (Train): 0.891681\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/wucwu1/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "'str' object is not callable",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-6-a968ea7dba8e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     12\u001b[0m  \u001b[0mscale_pos_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     13\u001b[0m  seed=27)\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0mmodelfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxgb1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpredictors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-5-0bc6edfa63bd>\u001b[0m in \u001b[0;36mmodelfit\u001b[0;34m(alg, dtrain, predictors, useTrainCV, cv_folds, early_stopping_rounds)\u001b[0m\n\u001b[1;32m     20\u001b[0m     \u001b[0mprint\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"AUC Score (Train): %f\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mroc_auc_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Disbursed'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtrain_predprob\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m     \u001b[0mfeat_imp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbooster\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_fscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     23\u001b[0m     \u001b[0mfeat_imp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'bar'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtitle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Feature Importances'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     24\u001b[0m     \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mylabel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Feature Importance Score'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mTypeError\u001b[0m: 'str' object is not callable"
     ]
    }
   ],
   "source": [
    "#learning_rate,n_estimators采用默认值\n",
    "predictors = [x for x in train.columns if x not in [target, IDcol]]\n",
    "xgb1 = XGBClassifier(\n",
    " learning_rate =0.1,\n",
    " n_estimators=1000,\n",
    " max_depth=5,\n",
    " min_child_weight=1,\n",
    " gamma=0,\n",
    " subsample=0.8,\n",
    " colsample_bytree=0.8,\n",
    " objective= 'binary:logistic',\n",
    " nthread=4,\n",
    " scale_pos_weight=1,\n",
    " seed=27)\n",
    "modelfit(xgb1, train, predictors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([mean: 0.83764, std: 0.00875, params: {'max_depth': 3, 'min_child_weight': 1},\n",
       "  mean: 0.83768, std: 0.00797, params: {'max_depth': 3, 'min_child_weight': 6},\n",
       "  mean: 0.83886, std: 0.00815, params: {'max_depth': 3, 'min_child_weight': 2},\n",
       "  mean: 0.82120, std: 0.00684, params: {'max_depth': 10, 'min_child_weight': 1},\n",
       "  mean: 0.82979, std: 0.00677, params: {'max_depth': 10, 'min_child_weight': 6},\n",
       "  mean: 0.82443, std: 0.00772, params: {'max_depth': 10, 'min_child_weight': 2},\n",
       "  mean: 0.83066, std: 0.01341, params: {'max_depth': 2, 'min_child_weight': 1},\n",
       "  mean: 0.83002, std: 0.01384, params: {'max_depth': 2, 'min_child_weight': 6},\n",
       "  mean: 0.83063, std: 0.01344, params: {'max_depth': 2, 'min_child_weight': 2}],\n",
       " {'max_depth': 3, 'min_child_weight': 2},\n",
       " 0.83885970524183)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#调优参数：max_depth、min_child_weight\n",
    "param_test1 = {\n",
    " 'max_depth':[3,10,2],\n",
    " 'min_child_weight':[1,6,2]\n",
    "}\n",
    "gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,\n",
    " min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,\n",
    " objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), \n",
    " param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)\n",
    "gsearch1.fit(train[predictors],train[target])\n",
    "gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([mean: 0.84034, std: 0.00601, params: {'max_depth': 4, 'min_child_weight': 4},\n",
       "  mean: 0.83921, std: 0.00658, params: {'max_depth': 4, 'min_child_weight': 5},\n",
       "  mean: 0.84003, std: 0.00622, params: {'max_depth': 4, 'min_child_weight': 6},\n",
       "  mean: 0.84071, std: 0.00553, params: {'max_depth': 5, 'min_child_weight': 4},\n",
       "  mean: 0.83935, std: 0.00548, params: {'max_depth': 5, 'min_child_weight': 5},\n",
       "  mean: 0.83871, std: 0.00492, params: {'max_depth': 5, 'min_child_weight': 6},\n",
       "  mean: 0.83926, std: 0.00302, params: {'max_depth': 6, 'min_child_weight': 4},\n",
       "  mean: 0.83717, std: 0.00483, params: {'max_depth': 6, 'min_child_weight': 5},\n",
       "  mean: 0.83729, std: 0.00605, params: {'max_depth': 6, 'min_child_weight': 6}],\n",
       " {'max_depth': 5, 'min_child_weight': 4},\n",
       " 0.8407073731353547)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#调优参数：max_depth、min_child_weight\n",
    "param_test2 = {\n",
    " 'max_depth':[4,5,6],\n",
    " 'min_child_weight':[4,5,6]\n",
    "}\n",
    "gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,\n",
    " min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,\n",
    " objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), \n",
    " param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)\n",
    "gsearch2.fit(train[predictors],train[target])\n",
    "gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([mean: 0.84003, std: 0.00622, params: {'gamma': 0.0},\n",
       "  mean: 0.84017, std: 0.00594, params: {'gamma': 0.1},\n",
       "  mean: 0.83963, std: 0.00624, params: {'gamma': 0.2},\n",
       "  mean: 0.83974, std: 0.00692, params: {'gamma': 0.3},\n",
       "  mean: 0.83996, std: 0.00568, params: {'gamma': 0.4}],\n",
       " {'gamma': 0.1},\n",
       " 0.8401672862430356)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#调优参数：gamma\n",
    "param_test3 = {\n",
    " 'gamma':[i/10.0 for i in range(0,5)]\n",
    "}\n",
    "gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,\n",
    " min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,\n",
    " objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), \n",
    " param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)\n",
    "gsearch3.fit(train[predictors],train[target])\n",
    "gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Model Report\n",
      "Accuracy : 0.9854\n",
      "AUC Score (Train): 0.882846\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/wucwu1/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "'str' object is not callable",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-13-5c8b95fb5336>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     11\u001b[0m  \u001b[0mscale_pos_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     12\u001b[0m  seed=27)\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mmodelfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxgb2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpredictors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-5-0bc6edfa63bd>\u001b[0m in \u001b[0;36mmodelfit\u001b[0;34m(alg, dtrain, predictors, useTrainCV, cv_folds, early_stopping_rounds)\u001b[0m\n\u001b[1;32m     20\u001b[0m     \u001b[0mprint\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"AUC Score (Train): %f\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mroc_auc_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Disbursed'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtrain_predprob\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m     \u001b[0mfeat_imp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbooster\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_fscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     23\u001b[0m     \u001b[0mfeat_imp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'bar'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtitle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Feature Importances'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     24\u001b[0m     \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mylabel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Feature Importance Score'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mTypeError\u001b[0m: 'str' object is not callable"
     ]
    }
   ],
   "source": [
    "#learning_rate,n_estimators采用调优值\n",
    "xgb2 = XGBClassifier(\n",
    " learning_rate =0.1,\n",
    " n_estimators=1000,\n",
    " max_depth=4,\n",
    " min_child_weight=6,\n",
    " gamma=0,\n",
    " subsample=0.8,\n",
    " colsample_bytree=0.8,\n",
    " objective= 'binary:logistic',\n",
    " nthread=4,\n",
    " scale_pos_weight=1,\n",
    " seed=27)\n",
    "modelfit(xgb2, train, predictors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([mean: 0.84023, std: 0.00870, params: {'colsample_bytree': 0.6, 'subsample': 0.6},\n",
       "  mean: 0.83791, std: 0.00788, params: {'colsample_bytree': 0.6, 'subsample': 0.7},\n",
       "  mean: 0.83783, std: 0.00718, params: {'colsample_bytree': 0.6, 'subsample': 0.8},\n",
       "  mean: 0.84060, std: 0.00698, params: {'colsample_bytree': 0.6, 'subsample': 0.9},\n",
       "  mean: 0.84029, std: 0.00784, params: {'colsample_bytree': 0.7, 'subsample': 0.6},\n",
       "  mean: 0.83903, std: 0.00648, params: {'colsample_bytree': 0.7, 'subsample': 0.7},\n",
       "  mean: 0.83918, std: 0.00774, params: {'colsample_bytree': 0.7, 'subsample': 0.8},\n",
       "  mean: 0.84055, std: 0.00668, params: {'colsample_bytree': 0.7, 'subsample': 0.9},\n",
       "  mean: 0.84168, std: 0.00817, params: {'colsample_bytree': 0.8, 'subsample': 0.6},\n",
       "  mean: 0.83917, std: 0.00795, params: {'colsample_bytree': 0.8, 'subsample': 0.7},\n",
       "  mean: 0.84070, std: 0.00665, params: {'colsample_bytree': 0.8, 'subsample': 0.8},\n",
       "  mean: 0.83993, std: 0.00571, params: {'colsample_bytree': 0.8, 'subsample': 0.9},\n",
       "  mean: 0.83958, std: 0.00804, params: {'colsample_bytree': 0.9, 'subsample': 0.6},\n",
       "  mean: 0.83944, std: 0.00721, params: {'colsample_bytree': 0.9, 'subsample': 0.7},\n",
       "  mean: 0.83936, std: 0.00560, params: {'colsample_bytree': 0.9, 'subsample': 0.8},\n",
       "  mean: 0.84162, std: 0.00701, params: {'colsample_bytree': 0.9, 'subsample': 0.9}],\n",
       " {'colsample_bytree': 0.8, 'subsample': 0.6},\n",
       " 0.8416829668102807)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#调优参数：colsample_bytree、subsample\n",
    "param_test4 = {\n",
    " 'subsample':[i/10.0 for i in range(6,10)],\n",
    " 'colsample_bytree':[i/10.0 for i in range(6,10)]\n",
    "}\n",
    "gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,\n",
    " min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,\n",
    " objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), \n",
    " param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)\n",
    "gsearch4.fit(train[predictors],train[target])\n",
    "gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise',\n",
       "       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,\n",
       "       max_depth=4, min_child_weight=6, missing=None, n_estimators=177,\n",
       "       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,\n",
       "       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,\n",
       "       subsample=0.8),\n",
       "       fit_params={}, iid=False, n_jobs=4,\n",
       "       param_grid={'subsample': [0.75, 0.8, 0.85], 'colsample_bytree': [0.75, 0.8, 0.85]},\n",
       "       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#调优参数：colsample_bytree、subsample\n",
    "param_test5 = {\n",
    " 'subsample':[i/100.0 for i in range(75,90,5)],\n",
    " 'colsample_bytree':[i/100.0 for i in range(75,90,5)]\n",
    "}\n",
    "gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,\n",
    " min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,\n",
    " objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), \n",
    " param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=5)\n",
    "gsearch5.fit(train[predictors],train[target])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([mean: 0.83949, std: 0.00720, params: {'reg_alpha': 1e-05},\n",
       "  mean: 0.83940, std: 0.00607, params: {'reg_alpha': 0.01},\n",
       "  mean: 0.83976, std: 0.00684, params: {'reg_alpha': 0.1},\n",
       "  mean: 0.84078, std: 0.00753, params: {'reg_alpha': 1},\n",
       "  mean: 0.81217, std: 0.01559, params: {'reg_alpha': 100}],\n",
       " {'reg_alpha': 1},\n",
       " 0.8407817095214378)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#调优参数：reg_alpha\n",
    "param_test6 = {\n",
    " 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]\n",
    "}\n",
    "gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,\n",
    " min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,\n",
    " objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), \n",
    " param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)\n",
    "gsearch6.fit(train[predictors],train[target])\n",
    "gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([mean: 0.83949, std: 0.00720, params: {'reg_alpha': 0},\n",
       "  mean: 0.83949, std: 0.00720, params: {'reg_alpha': 0.001},\n",
       "  mean: 0.83999, std: 0.00658, params: {'reg_alpha': 0.005},\n",
       "  mean: 0.83940, std: 0.00607, params: {'reg_alpha': 0.01},\n",
       "  mean: 0.83994, std: 0.00728, params: {'reg_alpha': 0.05}],\n",
       " {'reg_alpha': 0.005},\n",
       " 0.8399865207609579)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#调优参数：reg_alpha\n",
    "param_test7 = {\n",
    " 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]\n",
    "}\n",
    "gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,\n",
    " min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,\n",
    " objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), \n",
    " param_grid = param_test7, scoring='roc_auc',n_jobs=4,iid=False, cv=5)\n",
    "gsearch7.fit(train[predictors],train[target])\n",
    "gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Model Report\n",
      "Accuracy : 0.9854\n",
      "AUC Score (Train): 0.882692\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/wucwu1/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "'str' object is not callable",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-18-39af2058ca0b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     12\u001b[0m  \u001b[0mscale_pos_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     13\u001b[0m  seed=27)\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0mmodelfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxgb3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpredictors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-5-0bc6edfa63bd>\u001b[0m in \u001b[0;36mmodelfit\u001b[0;34m(alg, dtrain, predictors, useTrainCV, cv_folds, early_stopping_rounds)\u001b[0m\n\u001b[1;32m     20\u001b[0m     \u001b[0mprint\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"AUC Score (Train): %f\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mroc_auc_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Disbursed'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtrain_predprob\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m     \u001b[0mfeat_imp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbooster\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_fscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     23\u001b[0m     \u001b[0mfeat_imp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'bar'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtitle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Feature Importances'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     24\u001b[0m     \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mylabel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Feature Importance Score'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mTypeError\u001b[0m: 'str' object is not callable"
     ]
    }
   ],
   "source": [
    "#采用正则化和CV\n",
    "xgb3 = XGBClassifier(\n",
    " learning_rate =0.1,\n",
    " n_estimators=1000,\n",
    " max_depth=4,\n",
    " min_child_weight=6,\n",
    " gamma=0,\n",
    " subsample=0.8,\n",
    " colsample_bytree=0.8,\n",
    " reg_alpha=0.005,\n",
    " objective= 'binary:logistic',\n",
    " nthread=4,\n",
    " scale_pos_weight=1,\n",
    " seed=27)\n",
    "modelfit(xgb3, train, predictors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/wucwu1/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Model Report\n",
      "Accuracy : 0.9854\n",
      "AUC Score (Train): 0.886021\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "'str' object is not callable",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-19-b62c9494b245>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     12\u001b[0m  \u001b[0mscale_pos_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     13\u001b[0m  seed=27)\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0mmodelfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxgb4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpredictors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-5-0bc6edfa63bd>\u001b[0m in \u001b[0;36mmodelfit\u001b[0;34m(alg, dtrain, predictors, useTrainCV, cv_folds, early_stopping_rounds)\u001b[0m\n\u001b[1;32m     20\u001b[0m     \u001b[0mprint\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"AUC Score (Train): %f\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mroc_auc_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Disbursed'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtrain_predprob\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m     \u001b[0mfeat_imp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbooster\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_fscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     23\u001b[0m     \u001b[0mfeat_imp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'bar'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtitle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Feature Importances'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     24\u001b[0m     \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mylabel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Feature Importance Score'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mTypeError\u001b[0m: 'str' object is not callable"
     ]
    }
   ],
   "source": [
    "#降低学习率，增加更多的树\n",
    "xgb4 = XGBClassifier(\n",
    " learning_rate =0.01,\n",
    " n_estimators=5000,\n",
    " max_depth=4,\n",
    " min_child_weight=6,\n",
    " gamma=0,\n",
    " subsample=0.8,\n",
    " colsample_bytree=0.8,\n",
    " reg_alpha=0.005,\n",
    " objective= 'binary:logistic',\n",
    " nthread=4,\n",
    " scale_pos_weight=1,\n",
    " seed=27)\n",
    "modelfit(xgb4, train, predictors)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 最终模型特征重要性\n",
    "\n",
    "最终模型的评分为0.886021\n",
    "特征重要性：max_depth、min_child_weight、gamma 大于 colsample_bytree、subsample 大于 reg_alpha"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
